notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd

Series
- pandas 기본 객체로 DataFrame과 함께 빈번하게 사용
- ndarray 기반 인덱싱 추가(1차원 배열)



In [2]:

    
series = pd.Series(20)
series









    



0    20
dtype: int64



In [3]:

    
series[0]



In [4]:

    
series2 = pd.Series(range(1, 11))
series2









    



0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int32

index, value select



In [5]:

    
print(series2.values)
print(series2.index)









    



[ 1  2  3  4  5  6  7  8  9 10]
RangeIndex(start=0, stop=10, step=1)

index 지정



In [6]:

    
series3 = pd.Series(range(1, 5), index=['a', 'b', 'c', 'd'])
print(series3)
print('-'*50)
print(series3.values)
print('-'*50)
print(series3.index)









    



a    1
b    2
c    3
d    4
dtype: int32
--------------------------------------------------
[1 2 3 4]
--------------------------------------------------
Index(['a', 'b', 'c', 'd'], dtype='object')

value select
- loc: index
- iloc: 0-based index



In [7]:

    
series3









    Out[7]:





a    1
b    2
c    3
d    4
dtype: int32



In [8]:

    
print(series3[1])
print(series3['b'])

2
2



In [9]:

    
series3[['a', 'b']]









    Out[9]:





a    1
b    2
dtype: int32



In [10]:

    
print(series3.loc['a'])
print(series3.iloc[0])

1
1



In [11]:

    
print(series3.loc[['a', 'b']])
print(series3.iloc[[0, 1]])









    



a    1
b    2
dtype: int32
a    1
b    2
dtype: int32

series create and index reuse



In [12]:

    
series4 = pd.Series(1, index=series3.index)
series4









    Out[12]:





a    1
b    1
c    1
d    1
dtype: int64



In [13]:

    
series5 = pd.Series(np.random.randn(5))
series5









    Out[13]:





0   -0.744492
1    2.135461
2   -0.447777
3    0.043428
4   -0.240325
dtype: float64



In [14]:

    
series6 = pd.Series({'math':100, 'sci':80})
series6









    Out[14]:





math    100
sci      80
dtype: int64

size, shape, unique, count



In [15]:

    
s = pd.Series([2, 1, 2, 3, np.nan])
s









    Out[15]:





0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64



In [16]:

    
print(len(s))
print(s.size)

5
5



In [17]:

    
# return tuple
s.shape









    Out[17]:





(5,)



In [18]:

    
# counting
s.count()









    Out[18]:





4



In [19]:

    
# except duplicates
s.unique()









    Out[19]:





array([  2.,   1.,   3.,  nan])



In [20]:

    
# count values except NaN
s.value_counts()









    Out[20]:





2.0    2
3.0    1
1.0    1
dtype: int64

head, tail, take
- 자료 부분 출력



In [21]:

    
# top 5 elements
s.head()









    Out[21]:





0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64



In [22]:

    
# low 5 elements
s.tail()









    Out[22]:





0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64



In [23]:

    
s.head(n=2)









    Out[23]:





0    2.0
1    1.0
dtype: float64



In [24]:

    
# take by 0-based index
s.take([0, 1])









    Out[24]:





0    2.0
1    1.0
dtype: float64



In [25]:

    
s = pd.Series(range(1, 3), index=['a', 'b'])
s.take(['a', 'b'])









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-25-42c73aa60c4d> in <module>()
      1 s = pd.Series(range(1, 3), index=['a', 'b'])
----> 2 s.take(['a', 'b'])

D:\Anaconda3\lib\site-packages\pandas\core\series.py in take(self, indices, axis, convert, is_copy, **kwargs)
   2430         # check/convert indicies here
   2431         if convert:
-> 2432             indices = maybe_convert_indices(indices, len(self._get_axis(axis)))
   2433 
   2434         indices = _ensure_platform_int(indices)

D:\Anaconda3\lib\site-packages\pandas\core\indexing.py in maybe_convert_indices(indices, n)
   1866             return np.empty(0, dtype=np.int_)
   1867 
-> 1868     mask = indices < 0
   1869     if mask.any():
   1870         indices[mask] += n

TypeError: unorderable types: numpy.ndarray() < int()

calculating by index



In [1]:

    
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])

print(s1)
print(s2)









    



a    1
b    2
c    3
d    4
dtype: int64
d    4
c    3
b    2
a    1
dtype: int64



In [2]:

    
s1 + s2









    Out[2]:





a    2
b    4
c    6
d    8
dtype: int64



In [3]:

    
# element-wise in numpy
a1 = np.array([1, 2, 3, 4])
a2 = np.array([4, 3, 2, 1])

a1 + a2









    Out[3]:





array([5, 5, 5, 5])



In [4]:

    
s1 * s2









    Out[4]:





a     1
b     4
c     9
d    16
dtype: int64



In [5]:

    
s1 ** 3









    Out[5]:





a     1
b     8
c    27
d    64
dtype: int64



In [6]:

    
s3 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'f'])
s4 = pd.Series([4, 3, 2, 1, 0], index=['d', 'c', 'b', 'a', 'g'])

s3 + s4









    Out[6]:





a    2.0
b    4.0
c    6.0
d    8.0
f    NaN
g    NaN
dtype: float64

handling NA in pandas



In [7]:

    
np_array = np.array([1, 2, 3, np.NaN])
pd_series = pd.Series([1, 2, 3, np.NaN])



In [8]:

    
np_array.mean()









    Out[8]:





nan



In [9]:

    
pd_series.mean()









    Out[9]:





2.0



In [10]:

    
pd_series.mean(skipna=False)









    Out[10]:





nan